#Loading libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
# Load the crime records (2020-2023). read_csv reported parsing issues for
# this file, so inspect problems(dataset_crime) before trusting affected rows.
dataset_crime <- read_csv("crime_2020_2023.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 89335 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Date Rptd, DATE OCC, AREA NAME, Crm Cd Desc, Category, Vict Sex, ...
## dbl (7): DR_NO, Vict Age, Crm Cd 1, Crm Cd 2, Crm Cd 3, LAT, LON
## lgl (1): Crm Cd 4
## time (1): TIME OCC
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the daily weather observations (2020-2023), one row per calendar day
dataset_weather <- read_csv("weather_2020_2023.csv")
## Rows: 1461 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): temp, feelslike, humidity, windspeed
## date (1): datetime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(dataset_crime)
## spc_tbl_ [89,335 × 20] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ DR_NO : num [1:89335] 2.00e+08 2.02e+08 2.20e+08 2.22e+08 2.02e+08 ...
## $ Date Rptd : chr [1:89335] "09/04/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/14/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
## $ DATE OCC : chr [1:89335] "08/30/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/11/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
## $ TIME OCC : 'hms' num [1:89335] 11:00:00 22:30:00 08:00:00 00:01:00 ...
## ..- attr(*, "units")= chr "secs"
## $ AREA NAME : chr [1:89335] "Southwest" "Devonshire" "Southwest" "Mission" ...
## $ Crm Cd Desc : chr [1:89335] "VEHICLE - STOLEN" "BURGLARY" "THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)" "THEFT OF IDENTITY" ...
## $ Category : chr [1:89335] "THEFT" "BURGLARY" "THEFT" "THEFT" ...
## $ Vict Age : num [1:89335] 0 47 0 43 46 26 35 36 26 30 ...
## $ Vict Sex : chr [1:89335] NA "F" NA "F" ...
## $ Vict Descent: chr [1:89335] NA "A" NA "H" ...
## $ Premis Desc : chr [1:89335] "STREET" "SINGLE FAMILY DWELLING" "STREET" "SINGLE FAMILY DWELLING" ...
## $ Weapon Desc : chr [1:89335] NA NA NA NA ...
## $ Status Desc : chr [1:89335] "Invest Cont" "Invest Cont" "Invest Cont" "Invest Cont" ...
## $ Crm Cd 1 : num [1:89335] 510 310 420 354 354 230 230 624 210 230 ...
## $ Crm Cd 2 : num [1:89335] NA 998 NA NA NA NA NA NA NA NA ...
## $ Crm Cd 3 : num [1:89335] NA NA NA NA NA NA NA NA NA NA ...
## $ Crm Cd 4 : logi [1:89335] NA NA NA NA NA NA ...
## $ LOCATION : chr [1:89335] "2000 W 43RD ST" "12200 LOUISE AV" "2100 S HOOVER ST" "10000 N SEPULVEDA BL" ...
## $ LAT : num [1:89335] 34 34.3 34 34.3 34 ...
## $ LON : num [1:89335] -118 -119 -118 -118 -118 ...
## - attr(*, "spec")=
## .. cols(
## .. DR_NO = col_double(),
## .. `Date Rptd` = col_character(),
## .. `DATE OCC` = col_character(),
## .. `TIME OCC` = col_time(format = ""),
## .. `AREA NAME` = col_character(),
## .. `Crm Cd Desc` = col_character(),
## .. Category = col_character(),
## .. `Vict Age` = col_double(),
## .. `Vict Sex` = col_character(),
## .. `Vict Descent` = col_character(),
## .. `Premis Desc` = col_character(),
## .. `Weapon Desc` = col_character(),
## .. `Status Desc` = col_character(),
## .. `Crm Cd 1` = col_double(),
## .. `Crm Cd 2` = col_double(),
## .. `Crm Cd 3` = col_double(),
## .. `Crm Cd 4` = col_logical(),
## .. LOCATION = col_character(),
## .. LAT = col_double(),
## .. LON = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(dataset_crime)
## DR_NO Date Rptd DATE OCC TIME OCC
## Min. :200100508 Length:89335 Length:89335 Length:89335
## 1st Qu.:210306848 Class :character Class :character Class1:hms
## Median :220313378 Mode :character Mode :character Class2:difftime
## Mean :217004722 Mode :numeric
## 3rd Qu.:230206364
## Max. :242104022
##
## AREA NAME Crm Cd Desc Category Vict Age
## Length:89335 Length:89335 Length:89335 Min. :-2.00
## Class :character Class :character Class :character 1st Qu.: 6.00
## Mode :character Mode :character Mode :character Median :31.00
## Mean :29.63
## 3rd Qu.:45.00
## Max. :99.00
##
## Vict Sex Vict Descent Premis Desc Weapon Desc
## Length:89335 Length:89335 Length:89335 Length:89335
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Status Desc Crm Cd 1 Crm Cd 2 Crm Cd 3
## Length:89335 Min. :110.0 Min. :210.0 Min. :626.0
## Class :character 1st Qu.:331.0 1st Qu.:998.0 1st Qu.:998.0
## Mode :character Median :442.0 Median :998.0 Median :998.0
## Mean :500.6 Mean :957.7 Mean :983.9
## 3rd Qu.:626.0 3rd Qu.:998.0 3rd Qu.:998.0
## Max. :956.0 Max. :999.0 Max. :999.0
## NA's :82702 NA's :89109
## Crm Cd 4 LOCATION LAT LON
## Mode:logical Length:89335 Min. : 0.00 Min. :-118.7
## NA's:89335 Class :character 1st Qu.:34.01 1st Qu.:-118.4
## Mode :character Median :34.06 Median :-118.3
## Mean :33.98 Mean :-118.0
## 3rd Qu.:34.16 3rd Qu.:-118.3
## Max. :34.33 Max. : 0.0
##
# Checking the number of variables (columns) in the crime dataset
ncol(dataset_crime)
## [1] 20
🧐 ➡️ There are 20 variables in the Crime dataset
str(dataset_weather)
## spc_tbl_ [1,461 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ datetime : Date[1:1461], format: "2020-01-01" "2020-01-02" ...
## $ temp : num [1:1461] 15.2 14.8 16.2 14.7 14.5 17.1 17.6 14.3 14 13.3 ...
## $ feelslike: num [1:1461] 15.2 14.8 16.2 14.7 14.5 17.1 17.6 14.3 14 13.3 ...
## $ humidity : num [1:1461] 48.9 69.6 53 63.6 59.1 34.9 32.3 68.1 58.3 61.2 ...
## $ windspeed: num [1:1461] 12 11.1 8.1 6.6 8.6 7.4 7.8 9.8 12.7 10 ...
## - attr(*, "spec")=
## .. cols(
## .. datetime = col_date(format = ""),
## .. temp = col_double(),
## .. feelslike = col_double(),
## .. humidity = col_double(),
## .. windspeed = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(dataset_weather)
## datetime temp feelslike humidity
## Min. :2020-01-01 Min. : 8.3 Min. : 7.50 Min. :15.00
## 1st Qu.:2020-12-31 1st Qu.:15.1 1st Qu.:15.10 1st Qu.:55.20
## Median :2021-12-31 Median :18.1 Median :18.00 Median :66.50
## Mean :2021-12-31 Mean :18.2 Mean :18.18 Mean :62.21
## 3rd Qu.:2022-12-31 3rd Qu.:21.1 3rd Qu.:21.10 3rd Qu.:71.80
## Max. :2023-12-31 Max. :32.4 Max. :32.00 Max. :93.90
## windspeed
## Min. : 1.30
## 1st Qu.:10.10
## Median :12.50
## Mean :13.58
## 3rd Qu.:16.60
## Max. :42.80
# Checking the number of variables (columns) in the weather dataset
ncol(dataset_weather)
## [1] 5
🧐 ➡ There are 5 variables in Weather dataset
nrow(dataset_crime)
## [1] 89335
🧐 ➡ There are 89335 observations in Crime dataset
nrow(dataset_weather)
## [1] 1461
🧐 ➡ There are 1461 observations in Weather dataset
sapply(dataset_crime, class)
## $DR_NO
## [1] "numeric"
##
## $`Date Rptd`
## [1] "character"
##
## $`DATE OCC`
## [1] "character"
##
## $`TIME OCC`
## [1] "hms" "difftime"
##
## $`AREA NAME`
## [1] "character"
##
## $`Crm Cd Desc`
## [1] "character"
##
## $Category
## [1] "character"
##
## $`Vict Age`
## [1] "numeric"
##
## $`Vict Sex`
## [1] "character"
##
## $`Vict Descent`
## [1] "character"
##
## $`Premis Desc`
## [1] "character"
##
## $`Weapon Desc`
## [1] "character"
##
## $`Status Desc`
## [1] "character"
##
## $`Crm Cd 1`
## [1] "numeric"
##
## $`Crm Cd 2`
## [1] "numeric"
##
## $`Crm Cd 3`
## [1] "numeric"
##
## $`Crm Cd 4`
## [1] "logical"
##
## $LOCATION
## [1] "character"
##
## $LAT
## [1] "numeric"
##
## $LON
## [1] "numeric"
sapply(dataset_weather, class)
## datetime temp feelslike humidity windspeed
## "Date" "numeric" "numeric" "numeric" "numeric"
“Date Rptd” & “DATE OCC” are stored as character instead of date data. “Category” is stored as character, but would be better stored as categorical data. “Vict Age” is stored as numeric, but there may be errors related to negative values caused by human error during data entry.
No potential issues found
# Count missing (NA) values in each column of the crime dataset
filtered_crime_missing_value <- sapply(dataset_crime, function(x) sum(is.na(x)))
filtered_crime_missing_value
## DR_NO Date Rptd DATE OCC TIME OCC AREA NAME Crm Cd Desc
## 0 0 0 0 0 0
## Category Vict Age Vict Sex Vict Descent Premis Desc Weapon Desc
## 0 0 11655 11655 54 58054
## Status Desc Crm Cd 1 Crm Cd 2 Crm Cd 3 Crm Cd 4 LOCATION
## 0 0 82702 89109 89335 0
## LAT LON
## 0 0
# Keep only the variables that actually contain missing values
filtered_crime_missing_value[filtered_crime_missing_value > 0]
## Vict Sex Vict Descent Premis Desc Weapon Desc Crm Cd 2 Crm Cd 3
## 11655 11655 54 58054 82702 89109
## Crm Cd 4
## 89335
# Count missing (NA) values in each column of the weather dataset
filtered_weather_missing_value <- sapply(dataset_weather, function(x) sum(is.na(x)))
filtered_weather_missing_value
## datetime temp feelslike humidity windspeed
## 0 0 0 0 0
# Keep only the variables that actually contain missing values
# (fixed: this previously indexed filtered_crime_missing_value with the
# weather mask, i.e. the wrong vector; the printed result happened to be the
# same — an empty named vector — only because the weather data has no NAs)
filtered_weather_missing_value[filtered_weather_missing_value > 0]
## named integer(0)
🧐 ➡ There are no missing values in the Weather dataset❗
# Calculate the percentage of missing values in each variable of the crime dataset
missing_value_crime_percentage <- sapply(dataset_crime, function(x) sum(is.na(x)) / length(x) * 100)
# Show only the variables with missing values, sorted from highest to lowest
sort(missing_value_crime_percentage[missing_value_crime_percentage > 0], decreasing = TRUE)
## Crm Cd 4 Crm Cd 3 Crm Cd 2 Weapon Desc Vict Sex Vict Descent
## 100.00000000 99.74701965 92.57513852 64.98460850 13.04639839 13.04639839
## Premis Desc
## 0.06044663
# Coerce victim age to numeric
# NOTE(review): str() above already showed `Vict Age` is numeric, so this
# line is a defensive no-op
dataset_crime$`Vict Age` <- as.numeric(dataset_crime$`Vict Age`)
# Histogram of victim ages in 5-year bins (the spike at 0 is the
# "unknown age" placeholder, cleaned up later)
ggplot(dataset_crime, aes(`Vict Age`)) +
geom_histogram(fill = "pink", color = "black", binwidth = 5) +
labs(title="Distribution of Victim Age",
x="Age",
y="Number of cases") +
theme_minimal()
# Bar chart of victim descent codes (the NA bar = records with missing descent)
ggplot(dataset_crime, aes(x = `Vict Descent`)) +
geom_bar(fill = "lightblue", color = "black") +
labs(title = "Distribution of Victim Descent",
x = "Victim Descent",
y = "Number of Cases") +
theme_minimal()
# Compute the share of each victim gender; count() keeps NA as its own group,
# so missing gender appears as a separate slice
vict_gender_counts <- dataset_crime %>%
count(`Vict Sex`) %>%
mutate(`Vict Gender Percentage` = n / sum(n) * 100)
# Pie chart: a stacked single bar wrapped into polar coordinates
ggplot(vict_gender_counts, aes(x = "", y = `Vict Gender Percentage`, fill = `Vict Sex`)) +
geom_bar(stat = "identity", width = 1, color = "black") +
coord_polar(theta = "y") +
# Percentage labels centered inside each slice
geom_text(aes(label = paste0(round(`Vict Gender Percentage`, 1), "%")),
position = position_stack(vjust = 0.5), color = "white") +
labs(title = "Distribution of Victim Gender in Percentage",
x = "",
y = "",
fill = "Victim Gender") +
# Four colors: one per gender level plus the NA slice
scale_fill_manual(values = c("lightblue","orange", "lightpink", "lightgreen")) +
theme_classic() +
# Hide the now-meaningless cartesian axes
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank())
# Inspect the distinct investigation statuses and their frequencies
table(dataset_crime$`Status Desc`)
##
## Adult Arrest Adult Other Invest Cont Juv Arrest Juv Other UNK
## 8195 10321 70347 311 160 1
# Count the number of cases per status
case_status_counts <- dataset_crime %>%
count(`Status Desc`)
# Number of cases still open ("Invest Cont" = investigation continuing)
# NOTE(review): open_cases is not referenced again in the visible code — verify
open_cases <- case_status_counts %>%
filter(`Status Desc` == "Invest Cont") %>%
pull(n)
# Bar chart of the case statuses (legend suppressed; x labels suffice)
ggplot(case_status_counts, aes(x = `Status Desc`, y = n, fill = `Status Desc`)) +
geom_bar(stat = "identity", color = "black", width = 0.5) +
labs(title = "Distribution of Current Investigation Status",
x = "Case Status",
y = "Number of Cases") +
theme_minimal() +
theme(legend.position = "none")
✅ : - Provide all needed information of the data - Readable and understandable
❌ : - Could have been better if there is annotation to clarify exactly how many cases of each age group (see below for a revised version)
# Bucket victim ages into ten-year groups: [0,10), [10,20), ..., [90,100)
# (fixed: the labels previously read "0-5", "10-15", ..., "90-95", which
# mislabeled the ten-year-wide bins — e.g. bin [0,10) was labeled "0-5".
# Ages are whole years, so [0,10) is "0-9", [10,20) is "10-19", etc.)
dataset_crime$AgeGroup <- cut(dataset_crime$`Vict Age`,
breaks = seq(0, 100, by = 10),
right = FALSE,
labels = paste(seq(0, 90, by = 10), seq(9, 99, by = 10), sep = "-"))
# Bar chart of cases per age group, annotated with the exact count above each bar
ggplot(dataset_crime, aes(AgeGroup)) +
geom_bar(fill = "blue", color = "black") +
labs(title="Distribution of Victim Age",
x="Age Group",
y="Number of cases") +
theme_minimal() +
geom_text(stat='count', aes(label=after_stat(count)), vjust=-0.5, size=3)
As we analyzed above, the variables having the most missing values are:
Furthermore, below are columns that will also be removed as they are irrelevant or less likely to be used in the analysis:
🧐 ➡ Therefore, we will remove those columns in the next step of cleaning the dataset.
# Normalize column names to snake_case: lowercase, spaces replaced with "_"
colnames(dataset_crime) <- tolower(gsub(" ", "_", colnames(dataset_crime)))
print(colnames(dataset_crime))
## [1] "dr_no" "date_rptd" "date_occ" "time_occ" "area_name"
## [6] "crm_cd_desc" "category" "vict_age" "vict_sex" "vict_descent"
## [11] "premis_desc" "weapon_desc" "status_desc" "crm_cd_1" "crm_cd_2"
## [16] "crm_cd_3" "crm_cd_4" "location" "lat" "lon"
## [21] "agegroup"
# Drop columns that are mostly missing (crm_cd_2/3/4, weapon_desc — see the
# missing-value percentages above) or unlikely to be used (location, premis_desc)
dataset_crime_cleaned <- dataset_crime %>%
select(-crm_cd_2, -crm_cd_3, -crm_cd_4, -weapon_desc, -location, -premis_desc) %>%
# Rename the remaining columns to descriptive full words
rename(
report_number = dr_no,
date_reported = date_rptd,
date_occurrence = date_occ,
time_occurrence = time_occ,
crime_code_description = crm_cd_desc,
victim_age = vict_age,
victim_sex = vict_sex,
victim_descent = vict_descent,
status_description = status_desc,
crime_code_1 = crm_cd_1,
latitude = lat,
longitude = lon
)
# Check the updated dataset
str(dataset_crime_cleaned)
## tibble [89,335 × 15] (S3: tbl_df/tbl/data.frame)
## $ report_number : num [1:89335] 2.00e+08 2.02e+08 2.20e+08 2.22e+08 2.02e+08 ...
## $ date_reported : chr [1:89335] "09/04/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/14/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
## $ date_occurrence : chr [1:89335] "08/30/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/11/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
## $ time_occurrence : 'hms' num [1:89335] 11:00:00 22:30:00 08:00:00 00:01:00 ...
## ..- attr(*, "units")= chr "secs"
## $ area_name : chr [1:89335] "Southwest" "Devonshire" "Southwest" "Mission" ...
## $ crime_code_description: chr [1:89335] "VEHICLE - STOLEN" "BURGLARY" "THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)" "THEFT OF IDENTITY" ...
## $ category : chr [1:89335] "THEFT" "BURGLARY" "THEFT" "THEFT" ...
## $ victim_age : num [1:89335] 0 47 0 43 46 26 35 36 26 30 ...
## $ victim_sex : chr [1:89335] NA "F" NA "F" ...
## $ victim_descent : chr [1:89335] NA "A" NA "H" ...
## $ status_description : chr [1:89335] "Invest Cont" "Invest Cont" "Invest Cont" "Invest Cont" ...
## $ crime_code_1 : num [1:89335] 510 310 420 354 354 230 230 624 210 230 ...
## $ latitude : num [1:89335] 34 34.3 34 34.3 34 ...
## $ longitude : num [1:89335] -118 -119 -118 -118 -118 ...
## $ agegroup : Factor w/ 10 levels "0-5","10-15",..: 1 5 1 5 5 3 4 4 3 4 ...
# Replace any missing values with NA
# NOTE(review): this is a no-op — is.na() selects cells that are already NA
# and assigns NA back into them; kept only for fidelity with the original run
dataset_crime_cleaned[is.na(dataset_crime_cleaned)] <- NA
# Check the updated dataset
str(dataset_crime_cleaned)
## tibble [89,335 × 15] (S3: tbl_df/tbl/data.frame)
## $ report_number : num [1:89335] 2.00e+08 2.02e+08 2.20e+08 2.22e+08 2.02e+08 ...
## $ date_reported : chr [1:89335] "09/04/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/14/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
## $ date_occurrence : chr [1:89335] "08/30/2020 12:00:00 AM" "03/06/2020 12:00:00 AM" "01/11/2022 12:00:00 AM" "08/03/2022 12:00:00 AM" ...
## $ time_occurrence : 'hms' num [1:89335] 11:00:00 22:30:00 08:00:00 00:01:00 ...
## ..- attr(*, "units")= chr "secs"
## $ area_name : chr [1:89335] "Southwest" "Devonshire" "Southwest" "Mission" ...
## $ crime_code_description: chr [1:89335] "VEHICLE - STOLEN" "BURGLARY" "THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)" "THEFT OF IDENTITY" ...
## $ category : chr [1:89335] "THEFT" "BURGLARY" "THEFT" "THEFT" ...
## $ victim_age : num [1:89335] 0 47 0 43 46 26 35 36 26 30 ...
## $ victim_sex : chr [1:89335] NA "F" NA "F" ...
## $ victim_descent : chr [1:89335] NA "A" NA "H" ...
## $ status_description : chr [1:89335] "Invest Cont" "Invest Cont" "Invest Cont" "Invest Cont" ...
## $ crime_code_1 : num [1:89335] 510 310 420 354 354 230 230 624 210 230 ...
## $ latitude : num [1:89335] 34 34.3 34 34.3 34 ...
## $ longitude : num [1:89335] -118 -119 -118 -118 -118 ...
## $ agegroup : Factor w/ 10 levels "0-5","10-15",..: 1 5 1 5 5 3 4 4 3 4 ...
# Parse the "MM/DD/YYYY 12:00:00 AM" character columns into Date;
# as.Date() stops at the matched %m/%d/%Y prefix, discarding the time part
dataset_crime_cleaned <- dataset_crime_cleaned %>%
mutate(
date_reported = as.Date(date_reported, format = "%m/%d/%Y"),
date_occurrence = as.Date(date_occurrence, format = "%m/%d/%Y")
)
# Check the updated columns
str(dataset_crime_cleaned$date_reported)
## Date[1:89335], format: "2020-09-04" "2020-03-06" "2022-01-14" "2022-08-03" "2020-01-31" ...
str(dataset_crime_cleaned$date_occurrence)
## Date[1:89335], format: "2020-08-30" "2020-03-06" "2022-01-11" "2022-08-03" "2020-01-22" ...
# Find rows whose coordinates are missing or outside the valid
# latitude [-90, 90] / longitude [-180, 180] ranges
invalid_geo_coords <- dataset_crime_cleaned %>%
filter(
is.na(latitude) | is.na(longitude) |
latitude < -90 | latitude > 90 |
longitude < -180 | longitude > 180
)
# If any invalid rows exist, report them and drop them; otherwise say so
# NOTE(review): summary() earlier showed LAT/LON values of exactly 0
# ("Null Island" placeholders) — those pass this range check; consider
# also filtering latitude == 0 & longitude == 0
if (nrow(invalid_geo_coords) > 0) {
print("Rows with invalid geocoordinates:")
print(invalid_geo_coords)
# Keep only rows with complete, in-range coordinates
dataset_crime_cleaned <- dataset_crime_cleaned %>%
filter(
!is.na(latitude) & !is.na(longitude) &
latitude >= -90 & latitude <= 90 &
longitude >= -180 & longitude <= 180
)
# Check the updated columns
str(dataset_crime_cleaned$latitude)
str(dataset_crime_cleaned$longitude)
} else {
print("No invalid geocoordinates found.")
}
## [1] "No invalid geocoordinates found."
🧐 ➡ There are no invalid geocoordinates❗
# Filter out rows with non-positive victim ages: 0 is used as an "unknown"
# placeholder, and summary() earlier showed a minimum of -2 (data-entry errors)
# (fixed: the filter previously removed only age == 0, which kept the
# negative ages that the data-quality notes above flagged)
dataset_crime_cleaned <- dataset_crime_cleaned %>%
filter(victim_age > 0)
# Check the updated column
str(dataset_crime_cleaned$victim_age)
## num [1:67224] 47 43 46 26 35 36 26 30 42 55 ...
# Check that no non-positive age entries remain
any(dataset_crime_cleaned$victim_age <= 0)
## [1] FALSE
str(dataset_crime_cleaned)
## tibble [67,224 × 15] (S3: tbl_df/tbl/data.frame)
## $ report_number : num [1:67224] 2.02e+08 2.22e+08 2.02e+08 2.22e+08 2.02e+08 ...
## $ date_reported : Date[1:67224], format: "2020-03-06" "2022-08-03" ...
## $ date_occurrence : Date[1:67224], format: "2020-03-06" "2022-08-03" ...
## $ time_occurrence : 'hms' num [1:67224] 22:30:00 00:01:00 12:00:00 15:00:00 ...
## ..- attr(*, "units")= chr "secs"
## $ area_name : chr [1:67224] "Devonshire" "Mission" "Southeast" "Foothill" ...
## $ crime_code_description: chr [1:67224] "BURGLARY" "THEFT OF IDENTITY" "THEFT OF IDENTITY" "ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT" ...
## $ category : chr [1:67224] "BURGLARY" "THEFT" "THEFT" "ASSAULT" ...
## $ victim_age : num [1:67224] 47 43 46 26 35 36 26 30 42 55 ...
## $ victim_sex : chr [1:67224] "F" "F" "F" "F" ...
## $ victim_descent : chr [1:67224] "A" "H" "H" "H" ...
## $ status_description : chr [1:67224] "Invest Cont" "Invest Cont" "Invest Cont" "Invest Cont" ...
## $ crime_code_1 : num [1:67224] 310 354 354 230 230 624 210 230 330 341 ...
## $ latitude : num [1:67224] 34.3 34.3 34 34.3 34.3 ...
## $ longitude : num [1:67224] -119 -118 -118 -118 -118 ...
## $ agegroup : Factor w/ 10 levels "0-5","10-15",..: 5 5 5 3 4 4 3 4 5 6 ...
# Ensure the weather datetime column is a Date, coercing from character if
# needed. Use inherits() rather than class(x) != "Date": class() can return
# a vector of length > 1 (e.g. "hms" "difftime"), which makes the scalar
# comparison unreliable and warns/errors inside if().
if (!inherits(dataset_weather$datetime, "Date")) {
dataset_weather$datetime <- as.Date(dataset_weather$datetime, format="%Y-%m-%d")
# Check the updated columns
str(dataset_weather$datetime)
} else {
print("The datetime column is already in Date format.")
}
## [1] "The datetime column is already in Date format."
# Line & point plot: daily case counts over time (stat = "count" tallies
# rows per date), with 6-month tick marks
ggplot(dataset_crime_cleaned, aes(x = `date_occurrence`)) +
geom_line(stat = "count", color = "navy") +
geom_point(stat = "count", color = "red", size = 0.15) +
scale_x_date(
breaks = "6 months",
labels = scales::date_format("%Y-%m")
) +
labs(title = "Number of Cases Over Time",
x = "Date",
y = "Number of Cases") +
theme_minimal()
# Same data as a bar plot (one bar per day)
ggplot(dataset_crime_cleaned, aes(x = `date_occurrence`)) +
geom_bar(fill = "navy") +
scale_x_date(
breaks = "6 months",
labels = scales::date_format("%Y-%m")
) +
labs(title = "Number of Cases Over Time",
x = "Date",
y = "Number of Cases") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, hjust = 0.5))
# Same line plot with ggplot2's dark theme
ggplot(dataset_crime_cleaned, aes(x = `date_occurrence`)) +
geom_line(stat = "count") +
scale_x_date(
breaks = "6 months",
labels = scales::date_format("%Y-%m")
) +
labs(title = "Number of Cases Over Time",
x = "Date",
y = "Number of Cases") +
theme_dark()
# Excel-style theme (ggthemes); note this one plots date_reported,
# not date_occurrence like the three plots above
ggplot(dataset_crime_cleaned, aes(x = `date_reported`)) +
geom_line(stat = "count", color = "darkgreen") +
scale_x_date(
breaks = "6 months",
labels = scales::date_format("%Y-%m")
) +
labs(title = "Number of Cases Over Time",
x = "Date",
y = "Number of Cases") +
theme_excel()
# Install zoo only when it is missing — an unconditional install.packages()
# re-downloads the package on every run of the script
if (!requireNamespace("zoo", quietly = TRUE)) install.packages("zoo")
##
## The downloaded binary packages are in
## /var/folders/xk/9mmr645s72g8yf1651fpqg740000gn/T//RtmpUGS7x8/downloaded_packages
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Count the number of cases per day, ordered chronologically
daily_cases <- dataset_crime_cleaned %>%
group_by(date_occurrence) %>%
summarise(Number_of_Cases = n()) %>%
arrange(date_occurrence)
# Right-aligned 30-day rolling mean; the first 29 days get NA
daily_cases <- daily_cases %>%
mutate(RollingMean_30d = rollmean(Number_of_Cases, k = 30, fill = NA, align = "right"))
# Plot the smoothed series (na.omit drops the leading NA rows)
ggplot(na.omit(daily_cases), aes(x = date_occurrence)) +
geom_line(aes(y = RollingMean_30d), color = "red", linewidth = 0.25) +
labs(title = "Number of Cases Over Time (Rolling Mean)",
x = "Date",
y = "Number of Cases") +
scale_x_date(
breaks = seq(min(dataset_crime_cleaned$date_occurrence),
max(dataset_crime_cleaned$date_occurrence),
by = "6 months"),
labels = scales::date_format("%Y-%m")
) +
theme_minimal()
# NOTE(review): date_occurrence is already a Date (converted above), and
# as.Date() ignores `format` for Date input — this line is a no-op
dataset_crime_cleaned$date_occurrence <- as.Date(dataset_crime_cleaned$date_occurrence, format = "%m/%d/%Y")
# Identify the five victim-descent codes with the most cases
top_victims <- dataset_crime_cleaned %>%
group_by(victim_descent) %>%
summarise(Number_of_Cases = n()) %>%
top_n(5, Number_of_Cases) %>%
pull(victim_descent)
# Keep only the rows for those top 5 descents
filtered_data <- dataset_crime_cleaned %>%
filter(victim_descent %in% top_victims)
# Daily case counts, split by victim descent
daily_cases <- filtered_data %>%
group_by(date_occurrence, victim_descent) %>%
summarise(Number_of_Cases = n(), .groups = "drop") %>%
arrange(date_occurrence)
# Right-aligned 30-day rolling mean within each descent group
daily_cases <- daily_cases %>%
group_by(victim_descent) %>%
mutate(RollingMean_30d = rollmean(Number_of_Cases, k = 30, fill = NA, align = "right"))
# One smoothed line per descent code
ggplot(na.omit(daily_cases), aes(x = date_occurrence, y = RollingMean_30d, color = victim_descent, group = victim_descent)) +
geom_line(linewidth = 0.25) +
labs(title = "30-Day Rolling Mean of Cases Over Time by Top Five Victim Descent",
x = "Date",
y = "Number of Cases (Rolling Mean)",
color = "Victim Descent") +
scale_x_date(
breaks = seq(min(daily_cases$date_occurrence), max(daily_cases$date_occurrence), by = "6 months"),
labels = scales::date_format("%Y-%m")
) +
theme_minimal()
# Count the number of distinct crime categories
crime_types <- length(unique(dataset_crime_cleaned$category))
# Show the result (the backticks around crime_types are legal but unnecessary)
print(paste("Number of crime types:", `crime_types`))
## [1] "Number of crime types: 14"
# Tally the number of recorded cases per crime category
crime_types_counts <- dataset_crime_cleaned %>%
  count(category, name = "Number_of_Cases")
crime_types_counts
## # A tibble: 14 × 2
## category Number_of_Cases
## <chr> <int>
## 1 ASSAULT 19085
## 2 BURGLARY 9378
## 3 CHILDREN 484
## 4 COURT/LEGAL VIOLATIONS 2029
## 5 FRAUD/FORGERY 298
## 6 HOMICIDE 174
## 7 OTHER 2101
## 8 ROBBERY 2816
## 9 SEX CRIMES 1757
## 10 THEFT 19839
## 11 THREATS 3212
## 12 VANDALISM 5969
## 13 VEHICLE CRIMES 7
## 14 WEAPONS 75
# Keep the ten most frequent crime categories (ties kept, as before)
crime_counts <- dataset_crime_cleaned %>%
  count(category) %>%
  slice_max(n, n = 10)
# Horizontal bar chart, most common category at the top
ggplot(crime_counts, aes(x = reorder(category, n), y = n)) +
  geom_col(fill = "purple", color = "black", width = 0.5) +
  labs(title = "Top 10 Most Common Crime Types",
       x = "Crime Type",
       y = "Number of Cases") +
  coord_flip() + # Flip the axes for better readability
  theme_minimal()
# Restrict to the 10 most common categories, drop records with missing
# gender, then count cases by crime type and victim gender
crime_gender_counts <- dataset_crime_cleaned %>%
filter(category %in% names(sort(table(dataset_crime_cleaned$category), decreasing = TRUE)[1:10])) %>%
filter(!is.na(victim_sex)) %>%
group_by(category, victim_sex) %>%
summarize(Number_of_Cases = n(), .groups = "drop")
# Percentage of each gender within every crime type (columns sum to 100%)
crime_gender_percentage <- crime_gender_counts %>%
group_by(category) %>%
mutate(Percentage = (Number_of_Cases / sum(Number_of_Cases)) * 100)
# Stacked percentage bars for the 10 most common crime types
ggplot(crime_gender_percentage, aes(x = reorder(category, Percentage), y = Percentage, fill = victim_sex)) +
geom_bar(stat = "identity", position = "stack", color = "black", width = 0.5) +
scale_fill_manual(values = c("pink", "lightblue", "lightgreen", "yellow")) +
labs(title = "Distribution of Gender for Top 10 Crime Types",
x = "Crime Type",
y = "Percentage",
fill = "Gender") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Count cases per (gender, category) pair, excluding records with missing gender
crime_gender_counts <- dataset_crime_cleaned %>%
filter(!is.na(victim_sex)) %>%
group_by(victim_sex, category) %>%
summarise(count = n(), .groups = "drop")
# Dodged bar chart: absolute case counts per crime type, split by gender
# (fixed: the y-axis label read "Numnber of Cases")
ggplot(crime_gender_counts, aes(x = reorder(category, count), y = count, fill = victim_sex)) +
geom_bar(stat = "identity", position = "dodge", color = "black") +
labs(
title = "Distribution of Crime Types by Gender",
x = "Crime Type",
y = "Number of Cases",
fill = "Victim Gender"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Ensure 'date_reported' is of Date type
# (fixed: this previously assigned date_occurrence into date_reported,
# silently overwriting the reported date with the occurrence date; note the
# column was already converted to Date earlier, so this is defensive)
dataset_crime_cleaned$date_reported <- as.Date(dataset_crime_cleaned$date_reported)
# Count the number of crimes per day
crime_count_per_day <- dataset_crime_cleaned %>%
group_by(date_occurrence) %>%
summarise(crime_count = n(), .groups = "drop")
# Show the result
crime_count_per_day
## # A tibble: 1,461 × 2
## date_occurrence crime_count
## <date> <int>
## 1 2020-01-01 105
## 2 2020-01-02 48
## 3 2020-01-03 59
## 4 2020-01-04 44
## 5 2020-01-05 32
## 6 2020-01-06 41
## 7 2020-01-07 35
## 8 2020-01-08 37
## 9 2020-01-09 37
## 10 2020-01-10 51
## # ℹ 1,451 more rows
# NOTE(review): datetime was parsed as Date by read_csv, and as.Date()
# ignores `format` for Date input — this line is a defensive no-op
dataset_weather$datetime <- as.Date(dataset_weather$datetime, format = "%m/%d/%Y")
# Join the daily crime counts with the weather observations on the date;
# left join keeps every day that has at least one crime record
merged_data <- left_join(crime_count_per_day, dataset_weather, by = c("date_occurrence" = "datetime"))
merged_data
## # A tibble: 1,461 × 6
## date_occurrence crime_count temp feelslike humidity windspeed
## <date> <int> <dbl> <dbl> <dbl> <dbl>
## 1 2020-01-01 105 15.2 15.2 48.9 12
## 2 2020-01-02 48 14.8 14.8 69.6 11.1
## 3 2020-01-03 59 16.2 16.2 53 8.1
## 4 2020-01-04 44 14.7 14.7 63.6 6.6
## 5 2020-01-05 32 14.5 14.5 59.1 8.6
## 6 2020-01-06 41 17.1 17.1 34.9 7.4
## 7 2020-01-07 35 17.6 17.6 32.3 7.8
## 8 2020-01-08 37 14.3 14.3 68.1 9.8
## 9 2020-01-09 37 14 14 58.3 12.7
## 10 2020-01-10 51 13.3 13.3 61.2 10
## # ℹ 1,451 more rows
# Defensive coercion; date_occurrence is already a Date after the join
merged_data$date_occurrence <- as.Date(merged_data$date_occurrence)
# Right-aligned 30-day rolling means for daily crime count and temperature
merged_data <- merged_data %>%
arrange(date_occurrence) %>%
mutate(
rolling_crime_count = rollapply(crime_count, width = 30, FUN = mean, fill = NA, align = "right"),
rolling_temperature = rollapply(temp, width = 30, FUN = mean, fill = NA, align = "right")
)
# Scale factor mapping temperature onto the crime-count axis
# (the secondary axis below inverts it with ~ . * coeff)
coeff <- 0.47
# Overlay both smoothed series; temperature is rescaled so the two curves
# share the left axis, with a secondary right axis showing real °C values
ggplot(na.omit(merged_data), aes(x = date_occurrence)) +
geom_line(aes(y = rolling_crime_count, color = "Rolling Crime Count"), linewidth = 0.25) +
geom_line(aes(y = rolling_temperature / coeff, color = "Rolling Temperature"), linewidth = 0.25) +
labs(title = "Crime Count and Temperature Over Time (Rolling Mean)",
x = "Date",
y = "Rolling Mean Value",
color = "Legend") +
scale_color_manual(values = c("Rolling Crime Count" = "red", "Rolling Temperature" = "orange")) +
scale_y_continuous(
name = "Crime Count (Rolling Mean)",
breaks = seq(0, max(merged_data$rolling_crime_count, na.rm = TRUE), by = 10),
sec.axis = sec_axis(~ .*coeff, name = "Temperature (°C Rolling Mean)")
) +
scale_x_date(
breaks = seq(min(merged_data$date_occurrence), max(merged_data$date_occurrence), by = "6 months"),
labels = scales::date_format("%Y-%m")
) +
theme_minimal() +
theme(axis.text.y.left = element_text(margin = margin(l = 10)),
axis.text.y.right = element_text(margin = margin(r = 10)),
axis.title.y.right = element_text(angle = 90),
legend.title = element_blank())
# Simple linear regression: daily crime count as a function of temperature
lm_model <- lm(crime_count ~ temp, data = merged_data)
summary(lm_model)
##
## Call:
## lm(formula = crime_count ~ temp, data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.155 -6.507 -1.221 4.910 59.647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.0108 1.2590 33.369 < 2e-16 ***
## temp 0.2199 0.0676 3.253 0.00117 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.21 on 1459 degrees of freedom
## Multiple R-squared: 0.007199, Adjusted R-squared: 0.006518
## F-statistic: 10.58 on 1 and 1459 DF, p-value: 0.00117
# Install olsrr only when it is missing — an unconditional install.packages()
# re-downloads the package on every run of the script
if (!requireNamespace("olsrr", quietly = TRUE)) install.packages("olsrr")
##
## The downloaded binary packages are in
## /var/folders/xk/9mmr645s72g8yf1651fpqg740000gn/T//RtmpUGS7x8/downloaded_packages
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
# Refit the same simple regression (identical to lm_model above) and report
# it through olsrr's richer summary (RMSE, MAE, AIC, ANOVA table, CIs)
lm_model <- lm(crime_count ~ temp, data = merged_data)
ols_regress(lm_model)
## Model Summary
## -----------------------------------------------------------------
## R 0.085 RMSE 10.205
## R-Squared 0.007 MSE 104.136
## Adj. R-Squared 0.007 Coef. Var 22.193
## Pred R-Squared 0.004 AIC 10939.502
## MAE 7.509 SBC 10955.363
## -----------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
## AIC: Akaike Information Criteria
## SBC: Schwarz Bayesian Criteria
##
## ANOVA
## ------------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## ------------------------------------------------------------------------
## Regression 1103.145 1 1103.145 10.579 0.0012
## Residual 152142.633 1459 104.279
## Total 153245.778 1460
## ------------------------------------------------------------------------
##
## Parameter Estimates
## ----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ----------------------------------------------------------------------------------------
## (Intercept) 42.011 1.259 33.369 0.000 39.541 44.480
## temp 0.220 0.068 0.085 3.253 0.001 0.087 0.352
## ----------------------------------------------------------------------------------------
1. Regression Equation
The estimated regression equation is:
ŷ = 42.0108 + 0.2199 × temp
y = Predicted crime count
Intercept (42.0108): when temperature = 0, the expected crime count is approximately 42 crimes per day (note: confirm whether the dataset records temperature in °C or °F before interpreting this value literally)
Slope (0.2199): for each one-degree increase in temperature, the crime count increases by about 0.22 crimes per day on average
2. Statistical Significance
3. Goodness of Fit (R-squared)
4. Residual Analysis
5. Conclusion
Temperature has a statistically significant association with crime count, but the effect size is small (about 0.22 additional crimes per degree). The model is weak at predicting crime count (low R² ≈ 0.72%), meaning other factors should be included for better accuracy, such as day of the week, seasonality, holidays, or socioeconomic factors.
A relationship is causal if changing one variable directly causes a change in another.
The analysis suggests that temperature and crime count are associated, but correlation does not imply causation:
Adding variables: “feelslike”, “humidity”, “windspeed” to the regression
# Fit the multiple regression: daily crime count on all four weather variables
lm_model_multi <- lm(
  crime_count ~ temp + feelslike + humidity + windspeed,
  data = merged_data
)
# Standard coefficient table, R-squared, and F-test for the fitted model
summary(lm_model_multi)
##
## Call:
## lm(formula = crime_count ~ temp + feelslike + humidity + windspeed,
## data = merged_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.938 -6.547 -1.164 4.923 59.615
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.745116 1.831131 22.797 <2e-16 ***
## temp -2.301172 1.839047 -1.251 0.211
## feelslike 2.514220 1.835962 1.369 0.171
## humidity -0.001949 0.019277 -0.101 0.919
## windspeed 0.041389 0.054268 0.763 0.446
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.21 on 1456 degrees of freedom
## Multiple R-squared: 0.008958, Adjusted R-squared: 0.006236
## F-statistic: 3.29 on 4 and 1456 DF, p-value: 0.01075
# Reuse the multiple-regression model already fitted above — refitting the
# identical formula here was redundant work with no effect on the output.
ols_regress(lm_model_multi)
## Model Summary
## -----------------------------------------------------------------
## R 0.095 RMSE 10.196
## R-Squared 0.009 MSE 103.951
## Adj. R-Squared 0.006 Coef. Var 22.197
## Pred R-Squared 0.002 AIC 10942.910
## MAE 7.504 SBC 10974.631
## -----------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
## AIC: Akaike Information Criteria
## SBC: Schwarz Bayesian Criteria
##
## ANOVA
## -----------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -----------------------------------------------------------------------
## Regression 1372.837 4 343.209 3.29 0.0107
## Residual 151872.942 1456 104.308
## Total 153245.778 1460
## -----------------------------------------------------------------------
##
## Parameter Estimates
## ----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ----------------------------------------------------------------------------------------
## (Intercept) 41.745 1.831 22.797 0.000 38.153 45.337
## temp -2.301 1.839 -0.888 -1.251 0.211 -5.909 1.306
## feelslike 2.514 1.836 0.972 1.369 0.171 -1.087 6.116
## humidity -0.002 0.019 -0.003 -0.101 0.919 -0.040 0.036
## windspeed 0.041 0.054 0.020 0.763 0.446 -0.065 0.148
## ----------------------------------------------------------------------------------------
1. Regression Equation
The estimated regression equation is:
y = 41.75 − 2.30 × temp + 2.51 × feelslike − 0.0019 × humidity + 0.0414 × windspeed
2. Statistical Significance
3. Goodness of Fit (R-squared)
4. Residual Analysis
5. Conclusion
Should we include both temp and feelslike at the same time? Why or why not? The feelslike (perceived temperature) is not a completely independent variable — it is calculated from temp (actual temperature), along with humidity and wind speed.
Considering that, we should not include both temp and feelslike in the same regression model. Since the two variables are highly correlated (feelslike is mathematically derived from temp, humidity, and windspeed), including both in one regression model causes multicollinearity (high correlation between independent variables).
We saw this in the regression results: when both variables temp and feelslike were included, neither was statistically significant (p-values > 0.05). This happens because their shared variance causes the model to distribute the explanatory power between them, weakening their individual effects.
High correlation also leads to inflated standard errors for both variables, reducing our confidence in their estimated effects.
The analysis shows that weather variables and crime count are correlated, but the relationship is not necessarily causal.
Using map plot to geographically represent the distribution of homicide in the Los Angeles area (Location: California county, United States of America)
library(ggplot2)
library(dplyr)
library(usmap)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(mapdata)
# Build the county-level polygon data and keep only California
usa_county <- map_data("county")
usa_county_map <- usa_county %>%
  rename(longitude = long,
         latitude = lat)
california <- subset(usa_county_map, region == "california")
# Keep only homicide records; their coordinates come from the crime data itself
dataset_homicide <- dataset_crime_cleaned[dataset_crime_cleaned$category == "HOMICIDE", ]
# NOTE(review): the previous left_join() of county polygon vertices with crime
# points by exact longitude/latitude essentially never matches real rows, and
# its result was only used as a placeholder default dataset (both layers below
# override `data`). The join has been removed; each layer carries its own data.
ggplot() +
  # County outlines as the base layer
  geom_polygon(data = california,
               aes(x = longitude, y = latitude, group = group),
               color = "black", linewidth = 0.2, fill = "white") +
  # One point per homicide record
  geom_point(data = dataset_homicide,
             aes(x = longitude, y = latitude),
             fill = "red", color = "black", shape = 21, size = 2) +
  # Limit the view to the Los Angeles area
  coord_cartesian(xlim = c(-119.5, -117.0), ylim = c(33.7, 34.4)) +
  labs(title = "Distribution of Homicide in Los Angeles Area",
       x = "",
       y = "",
       color = "") +
  theme_minimal()
library(ggplot2)
library(dplyr)
library(ggmap)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
## OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
# SECURITY: never commit an API key to source control. Read it from the
# environment instead (set STADIAMAPS_API_KEY in ~/.Renviron). The literal key
# previously hard-coded here should be revoked and rotated.
register_stadiamaps(key = Sys.getenv("STADIAMAPS_API_KEY"))
# Define bounding box for Los Angeles
la_bbox <- c(left = -118.75, bottom = 33.7, right = -118.0, top = 34.4)
# Download the basemap tiles for the Los Angeles area from Stadia Maps
la_map <- get_stadiamap(bbox = la_bbox, zoom = 11, maptype = "stamen_toner_lite")
## ℹ © Stadia Maps © Stamen Design © OpenMapTiles © OpenStreetMap contributors.
# Keep only homicide records whose coordinates fall inside the bounding box
dataset_homicide <- dataset_crime_cleaned %>%
  filter(category == "HOMICIDE") %>%
  filter(longitude >= la_bbox["left"] & longitude <= la_bbox["right"],
         latitude >= la_bbox["bottom"] & latitude <= la_bbox["top"])
# Overlay the homicide locations on the street-level basemap
ggmap(la_map) +
  geom_point(data = dataset_homicide, aes(x = longitude, y = latitude),
             fill = "red", color = "black", shape = 21, size = 2) +
  labs(title = "Distribution of Homicide in Los Angeles Area",
       x = "",
       y = "") +
  theme_minimal()
# Install plotly only when it is missing — an unconditional install.packages()
# re-downloads the package on every knit/run.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
##
## The downloaded binary packages are in
## /var/folders/xk/9mmr645s72g8yf1651fpqg740000gn/T//RtmpUGS7x8/downloaded_packages
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
##
## wind
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Count the occurrences for each crime category and keep the 8 most frequent
top_8_crimes <- dataset_crime_cleaned %>%
  count(category) %>%
  arrange(desc(n)) %>%
  head(8)
# Extract the category names from top_8_crimes
top_8_categories <- top_8_crimes$category
# Restrict the plotted data to those top 8 categories
df <- dataset_crime_cleaned %>%
  filter(category %in% top_8_categories)
# NOTE(review): removed the unused `m <- list(colorbar = ...)` object — it was
# never passed to any plotly call.
# Set up the geographic map layout for the Los Angeles area
g <- list(
  # "america" is not a valid plotly geo scope; the documented value is "usa"
  scope = 'usa',
  showland = TRUE,
  landcolor = toRGB("grey83"),
  subunitcolor = toRGB("white"),
  countrycolor = toRGB("white"),
  showlakes = TRUE,
  lakecolor = toRGB("white"),
  showsubunits = TRUE,
  showcountries = TRUE,
  resolution = 50,
  projection = list(
    type = 'conic conformal',
    rotation = list(lon = -118.25)
  ),
  lonaxis = list(
    showgrid = TRUE,
    gridwidth = 0.5,
    range = c(-119, -117.8),
    dtick = 0.2
  ),
  lataxis = list(
    showgrid = TRUE,
    gridwidth = 0.5,
    range = c(33.5, 34.5),
    dtick = 0.2
  )
)
# Create the interactive map, coloring markers by crime category
fig <- plot_geo(df, lat = ~latitude, lon = ~longitude, color = ~category)
# Add hover text; use formula column references so the text stays aligned with
# the plotted rows (mixing `df$...` into a `~` formula risks desynchronization
# if plotly subsets or reorders the data per trace)
fig <- fig %>% add_markers(
  text = ~paste("Crime:", category,
                "<br>Description:", crime_code_description,
                "<br>Location:", area_name),
  hoverinfo = "text")
# Final layout
fig <- fig %>% layout(title = "Crime Distribution in Los Angeles (Hover for Details)", geo = g)
fig